# Basic Python libraries
import pandas as pd
import numpy as np
# Date handling
from datetime import datetime
from datetime import timedelta
# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
%matplotlib inline
# Library for statistical operations
import scipy.stats as stats
# Machine learning libraries
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Settings
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.inf, precision=3)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# Read the data into a DataFrame
credit = pd.read_csv("CC_GENERAL.csv")
credit.head()
credit.info()
# Total number of missing values in the dataframe
print("\nMissing values : ", credit.isnull().sum().values.sum())
# Number of unique values per column
print("\nUnique values : \n", credit.nunique())
credit.shape
# Initial descriptive analysis of the data.
credit.describe()
- Since there are missing values in the data, we impute them with the median.
credit.isnull().any()
# CREDIT_LIMIT and MINIMUM_PAYMENTS have missing values, so we impute them with the median.
credit['CREDIT_LIMIT'] = credit['CREDIT_LIMIT'].fillna(credit['CREDIT_LIMIT'].median())
credit['CREDIT_LIMIT'].count()
credit['MINIMUM_PAYMENTS'].median()
credit['MINIMUM_PAYMENTS'] = credit['MINIMUM_PAYMENTS'].fillna(credit['MINIMUM_PAYMENTS'].median())
# Check again for missing values.
credit.isnull().any()
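As an aside, the same median imputation can be done with scikit-learn's SimpleImputer (a minimal sketch, equivalent to the fillna calls above; handy when the fitted medians must be reapplied to new data later):
from sklearn.impute import SimpleImputer
# Fit medians on the two affected columns and fill their missing values in one step
imputer = SimpleImputer(strategy='median')
credit[['CREDIT_LIMIT', 'MINIMUM_PAYMENTS']] = imputer.fit_transform(credit[['CREDIT_LIMIT', 'MINIMUM_PAYMENTS']])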
1. Monthly average purchase and cash advance amount
credit['Monthly_avg_purchase']=credit['PURCHASES']/credit['TENURE']
print(credit['Monthly_avg_purchase'].head(),'\n ',
credit['TENURE'].head(),'\n', credit['PURCHASES'].head())
credit['Monthly_cash_advance']=credit['CASH_ADVANCE']/credit['TENURE']
credit[credit['ONEOFF_PURCHASES']==0]['ONEOFF_PURCHASES'].count()
credit.loc[:,['ONEOFF_PURCHASES','INSTALLMENTS_PURCHASES']]
credit[(credit['ONEOFF_PURCHASES']==0) & (credit['INSTALLMENTS_PURCHASES']==0)].shape
credit[(credit['ONEOFF_PURCHASES']>0) & (credit['INSTALLMENTS_PURCHASES']>0)].shape
credit[(credit['ONEOFF_PURCHASES']>0) & (credit['INSTALLMENTS_PURCHASES']==0)].shape
credit[(credit['ONEOFF_PURCHASES']==0) & (credit['INSTALLMENTS_PURCHASES']>0)].shape
The checks above reveal four types of purchase behaviour in the data set, so we derive a categorical variable that captures them.
def purchase(row):
    if (row['ONEOFF_PURCHASES'] == 0) and (row['INSTALLMENTS_PURCHASES'] == 0):
        return 'none'
    elif (row['ONEOFF_PURCHASES'] > 0) and (row['INSTALLMENTS_PURCHASES'] > 0):
        return 'both_oneoff_installment'
    elif (row['ONEOFF_PURCHASES'] > 0) and (row['INSTALLMENTS_PURCHASES'] == 0):
        return 'one_off'
    else:
        return 'installment'
credit['purchase_type'] = credit.apply(purchase, axis=1)
credit['purchase_type'].value_counts()
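The same variable can be derived without a row-wise apply; a vectorized sketch using numpy.select:
conditions = [
    (credit['ONEOFF_PURCHASES'] == 0) & (credit['INSTALLMENTS_PURCHASES'] == 0),
    (credit['ONEOFF_PURCHASES'] > 0) & (credit['INSTALLMENTS_PURCHASES'] > 0),
    (credit['ONEOFF_PURCHASES'] > 0) & (credit['INSTALLMENTS_PURCHASES'] == 0),
]
choices = ['none', 'both_oneoff_installment', 'one_off']
# Any remaining rows have only installment purchases
credit['purchase_type'] = np.select(conditions, choices, default='installment')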
# Credit utilisation: balance as a fraction of the credit limit
credit['limit_usage'] = credit['BALANCE'] / credit['CREDIT_LIMIT']
credit['limit_usage'].head()
credit['PAYMENTS'].isnull().any()
credit['MINIMUM_PAYMENTS'].isnull().value_counts()
credit['MINIMUM_PAYMENTS'].describe()
credit['payment_minpay'] = credit['PAYMENTS'] / credit['MINIMUM_PAYMENTS']
credit['payment_minpay']
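Note that this ratio is undefined wherever MINIMUM_PAYMENTS is 0; a defensive sketch (assuming such zeros can occur) treats them as missing before dividing:
# Replace zero minimum payments with NaN so the division yields NaN instead of inf
safe_minpay = credit['MINIMUM_PAYMENTS'].replace(0, np.nan)
credit['payment_minpay'] = credit['PAYMENTS'] / safe_minpay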
# Log transformation (log(x + 1) handles the zero values safely)
cr_log = credit.drop(['CUST_ID', 'purchase_type'], axis=1).applymap(lambda x: np.log(x + 1))
cr_log.describe()
col=['BALANCE','PURCHASES','CASH_ADVANCE','TENURE','PAYMENTS','MINIMUM_PAYMENTS','PRC_FULL_PAYMENT','CREDIT_LIMIT']
cr_pre=cr_log[[x for x in cr_log.columns if x not in col ]]
cr_pre.columns
cr_log.columns
x=credit.groupby('purchase_type').apply(lambda x: np.mean(x['payment_minpay']))
type(x)
x.values
fig,ax=plt.subplots()
ax.barh(y=range(len(x)), width=x.values,align='center')
ax.set(yticks= np.arange(len(x)),yticklabels = x.index);
plt.title('Mean payment-to-minimum-payment ratio for each purchase type')
credit.describe()
credit[credit['purchase_type'] == 'none']
credit.groupby('purchase_type').apply(lambda x: np.mean(x['Monthly_cash_advance'])).plot.barh()
plt.title('Average cash advance taken by customers of each purchase type: Both, None, Installment, One-off')
credit.groupby('purchase_type').apply(lambda x: np.mean(x['limit_usage'])).plot.barh()
cre_original=pd.concat([credit,pd.get_dummies(credit['purchase_type'])],axis=1)
We have one categorical variable, purchase_type, which we convert with the help of dummy creation.
# Creating dummies for the categorical variable
cr_pre['purchase_type']=credit.loc[:,'purchase_type']
pd.get_dummies(cr_pre['purchase_type'])
cr_dummy=pd.concat([cr_pre,pd.get_dummies(cr_pre['purchase_type'])],axis=1)
l=['purchase_type']
cr_dummy=cr_dummy.drop(l,axis=1)
cr_dummy.isnull().any()
cr_dummy.info()
cr_dummy.head(3)
sns.heatmap(cr_dummy.corr())
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
cr_dummy.shape
cr_scaled=sc.fit_transform(cr_dummy)
cr_scaled
We reduce the number of features with the help of principal component analysis (PCA).
from sklearn.decomposition import PCA
cr_dummy.shape
# We have 17 features, so we start with n_components=17.
pc=PCA(n_components=17)
cr_pca=pc.fit(cr_scaled)
# Check how much variance 17 components explain. It should be 1, i.e. 100%.
sum(cr_pca.explained_variance_ratio_)
var_ratio = {}
for n in range(2, 18):
    pc = PCA(n_components=n)
    cr_pca = pc.fit(cr_scaled)
    var_ratio[n] = sum(cr_pca.explained_variance_ratio_)
var_ratio
Since 6 components explain about 90% of the variance, we select 6 components.
pc=PCA(n_components=6)
p=pc.fit(cr_scaled)
cr_scaled.shape
p.explained_variance_
np.sum(p.explained_variance_)
var_ratio
pd.Series(var_ratio).plot()
The explained-variance curve flattens around 6 components (about 90% of the variance), so we proceed with 6 components.
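Alternatively, scikit-learn can pick the component count for a target variance fraction directly; a sketch passing a float to n_components:
pc_auto = PCA(n_components=0.90)   # keep just enough components to explain ~90% of the variance
pc_auto.fit(cr_scaled)
pc_auto.n_components_              # number of components actually retained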
cr_scaled.shape
pc_final=PCA(n_components=6).fit(cr_scaled)
reduced_cr = pc_final.transform(cr_scaled)  # pc_final is already fitted above
dd=pd.DataFrame(reduced_cr)
dd.head()
We started with 17 variables; after PCA we are down to 6 principal components.
dd.shape
col_list=cr_dummy.columns
col_list
pd.DataFrame(pc_final.components_.T, columns=['PC_' +str(i) for i in range(6)],index=col_list)
# Variance explained by each principal component
pd.Series(pc_final.explained_variance_ratio_,index=['PC_'+ str(i) for i in range(6)])
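To read the loadings more easily, we can list, for each component, the variables with the largest absolute loading (a small sketch reusing the loadings table above):
loadings = pd.DataFrame(pc_final.components_.T,
                        columns=['PC_' + str(i) for i in range(6)], index=col_list)
for pc in loadings.columns:
    # Top 3 variables driving this component
    top = loadings[pc].abs().sort_values(ascending=False).head(3)
    print(pc, '->', list(top.index))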
Based on intuition about the types of purchases customers make, and the distinctive behaviour of each purchase_type visualised above in the KPI insights, I start with 4 clusters.
from sklearn.cluster import KMeans
km_4=KMeans(n_clusters=4,random_state=123)
km_4.fit(reduced_cr)
km_4.labels_
pd.Series(km_4.labels_).value_counts()
We do not know a good value of K in advance, so we search for it: we fit K-means over a range of cluster counts from 1 to 20 and apply the elbow method.
cluster_range = range(1, 21)
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans(n_clusters=num_clusters)
    clusters.fit(reduced_cr)
    cluster_errors.append(clusters.inertia_)  # inertia_ is the within-cluster sum of squares, i.e. the cluster error
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df[0:21]
plt.figure(figsize=(12,6))
plt.plot( clusters_df.num_clusters, clusters_df.cluster_errors, marker = "o" )
From the graph above we locate the elbow range; here it falls around 4 to 6 clusters.
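The elbow can also be located programmatically; a rough sketch using the second differences of the error curve (the point of maximum curvature):
errs = np.array(cluster_errors)
second_diff = np.diff(errs, 2)                       # discrete curvature of the inertia curve
elbow_k = cluster_range[int(np.argmax(second_diff)) + 1]
print('Approximate elbow at K =', elbow_k)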
from sklearn import metrics
# Calculate the silhouette coefficient for K=2 through K=20
k_range = range(2, 21)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(reduced_cr)
    scores.append(metrics.silhouette_score(reduced_cr, km.labels_))
scores
# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)
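The best K by silhouette score can also be read off programmatically (a small sketch):
best_k = k_range[int(np.argmax(scores))]
print('Best K by silhouette score:', best_k)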
color_map = {0: 'r', 1: 'b', 2: 'g', 3: 'y'}
label_color = [color_map[l] for l in km_4.labels_]
plt.figure(figsize=(7, 7))
plt.scatter(reduced_cr[:, 0], reduced_cr[:, 1], c=label_color, alpha=0.1)  # cmap is ignored when explicit colors are passed
It is hard to draw an individual plot for every pair of components, so we use a pair plot, which shows all pairwise relationships in one shot. The steps are as follows:
df_pair_plot=pd.DataFrame(reduced_cr,columns=['PC_' +str(i) for i in range(6)])
df_pair_plot['Cluster']=km_4.labels_ #Add cluster column in the data frame
df_pair_plot.head()
#pairwise relationship of components on the data
sns.pairplot(df_pair_plot, hue='Cluster', palette='Dark2', diag_kind='kde', height=1.85)
The plot shows that the first two components already separate the clusters well.
To interpret the results we go back to the original variables.
# Key performance variable selection: these are the variables used in deriving the new KPIs.
# We could take all 17 variables, but that would be hard to interpret, so we select a smaller set.
col_kpi=['PURCHASES_TRX','Monthly_avg_purchase','Monthly_cash_advance','limit_usage','CASH_ADVANCE_TRX',
         'payment_minpay','both_oneoff_installment','installment','one_off','none','CREDIT_LIMIT']
cr_pre.describe()
# Concatenating the labels found through K-means with the data
cluster_df_4=pd.concat([cre_original[col_kpi],pd.Series(km_4.labels_,name='Cluster_4')],axis=1)
cluster_df_4.head()
# The mean gives a good summary of each variable's distribution, so we compute it per variable per cluster.
cluster_4=cluster_df_4.groupby('Cluster_4')\
.apply(lambda x: x[col_kpi].mean()).T
cluster_4
fig,ax=plt.subplots(figsize=(15,10))
index=np.arange(len(cluster_4.columns))
cash_advance=np.log(cluster_4.loc['Monthly_cash_advance',:].values)
credit_score=(cluster_4.loc['limit_usage',:].values)
purchase= np.log(cluster_4.loc['Monthly_avg_purchase',:].values)
payment=cluster_4.loc['payment_minpay',:].values
installment=cluster_4.loc['installment',:].values
one_off=cluster_4.loc['one_off',:].values
bar_width=.10
b1=plt.bar(index,cash_advance,color='b',label='Monthly cash advance',width=bar_width)
b2=plt.bar(index+bar_width,credit_score,color='m',label='Credit_score',width=bar_width)
b3=plt.bar(index+2*bar_width,purchase,color='k',label='Avg purchase',width=bar_width)
b4=plt.bar(index+3*bar_width,payment,color='c',label='Payment-minpayment ratio',width=bar_width)
b5=plt.bar(index+4*bar_width,installment,color='r',label='installment',width=bar_width)
b6=plt.bar(index+5*bar_width,one_off,color='g',label='One_off purchase',width=bar_width)
plt.xlabel("Cluster")
plt.title("Insights")
plt.xticks(index + bar_width, ('Cl-0', 'Cl-1', 'Cl-2', 'Cl-3'))
plt.legend()
Insights
# Percentage of each cluster in the total customer base
s = cluster_df_4['Cluster_4'].value_counts().sort_index()
print(s, '\n')
per = pd.Series((s.values.astype('float') / cluster_df_4.shape[0]) * 100, name='Percentage')
print("Cluster-4\n")
print(pd.concat([pd.Series(s.values, name='Size'), per], axis=1))
km_5=KMeans(n_clusters=5,random_state=123)
km_5=km_5.fit(reduced_cr)
km_5.labels_
pd.Series(km_5.labels_).value_counts()
plt.figure(figsize=(7,7))
plt.scatter(reduced_cr[:,0],reduced_cr[:,1],c=km_5.labels_,cmap='Spectral',alpha=0.5)
plt.xlabel('PC_0')
plt.ylabel('PC_1')
cluster_df_5=pd.concat([cre_original[col_kpi],pd.Series(km_5.labels_,name='Cluster_5')],axis=1)
# Finding Mean of features for each cluster
cluster_df_5.groupby('Cluster_5')\
.apply(lambda x: x[col_kpi].mean()).T
One group of customers (Cluster 2) has the highest average purchases, while Cluster 4 has the highest cash advance and the second-highest purchases, yet their types of purchases are the same.
Cluster 0 and Cluster 4 also behave similarly in terms of credit limit, and their cash transactions are on the higher side.
So with 5 clusters we do not get clearly distinguishable characteristics.
s1 = cluster_df_5['Cluster_5'].value_counts().sort_index()
print(s1)
# Percentage of each cluster
print("Cluster-5\n")
per_5 = pd.Series((s1.values.astype('float') / cluster_df_5.shape[0]) * 100, name='Percentage')
print(pd.concat([pd.Series(s1.values, name='Size'), per_5], axis=1))
km_6 = KMeans(n_clusters=6, random_state=123).fit(reduced_cr)  # fixed seed, consistent with the earlier fits
km_6.labels_
color_map={0:'r',1:'b',2:'g',3:'c',4:'m',5:'k'}
label_color=[color_map[l] for l in km_6.labels_]
plt.figure(figsize=(7,7))
plt.scatter(reduced_cr[:, 0], reduced_cr[:, 1], c=label_color, alpha=0.5)  # cmap is ignored when explicit colors are passed
cluster_df_6 = pd.concat([cre_original[col_kpi],pd.Series(km_6.labels_,name='Cluster_6')],axis=1)
six_cluster=cluster_df_6.groupby('Cluster_6').apply(lambda x: x[col_kpi].mean()).T
six_cluster
fig,ax=plt.subplots(figsize=(15,10))
index=np.arange(len(six_cluster.columns))
cash_advance=np.log(six_cluster.loc['Monthly_cash_advance',:].values)
credit_score=(six_cluster.loc['limit_usage',:].values)
purchase= np.log(six_cluster.loc['Monthly_avg_purchase',:].values)
payment=six_cluster.loc['payment_minpay',:].values
installment=six_cluster.loc['installment',:].values
one_off=six_cluster.loc['one_off',:].values
bar_width=.10
b1=plt.bar(index,cash_advance,color='b',label='Monthly cash advance',width=bar_width)
b2=plt.bar(index+bar_width,credit_score,color='m',label='Credit_score',width=bar_width)
b3=plt.bar(index+2*bar_width,purchase,color='k',label='Avg purchase',width=bar_width)
b4=plt.bar(index+3*bar_width,payment,color='c',label='Payment-minpayment ratio',width=bar_width)
b5=plt.bar(index+4*bar_width,installment,color='r',label='installment',width=bar_width)
b6=plt.bar(index+5*bar_width,one_off,color='g',label='One_off purchase',width=bar_width)
plt.xlabel("Cluster")
plt.title("Insights")
plt.xticks(index + bar_width, ('Cl-0', 'Cl-1', 'Cl-2', 'Cl-3','Cl-4','Cl-5'))
plt.legend()
from sklearn.metrics import calinski_harabasz_score, silhouette_score
score = {}
score_c = {}
for n in range(3, 10):
    km_score = KMeans(n_clusters=n)
    km_score.fit(reduced_cr)
    score_c[n] = calinski_harabasz_score(reduced_cr, km_score.labels_)
    score[n] = silhouette_score(reduced_cr, km_score.labels_)
pd.Series(score).plot()
pd.Series(score_c).plot()
The performance metrics also suggest that K-means with 4 clusters yields the most distinguishable characteristics per cluster.
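To back that up numerically, we can print both scores at K=4 from the dictionaries computed above (a small sketch):
print('Silhouette score at K=4        :', score[4])
print('Calinski-Harabasz score at K=4 :', score_c[4])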
Insights with 4 Clusters